from langchain_community.document_loaders import WebBaseLoader

if __name__ == "__main__":
    # Demo script: fetch a single web page through WebBaseLoader, inspect the
    # first loaded item, then show its text again with blank lines removed.

    # Address of the page to load (path segments are already percent-encoded).
    target_url = (
        "https://datawhalechina.github.io/fantastic-matplotlib/"
        "%E7%AC%AC%E4%B8%80%E5%9B%9E%EF%BC%9AMatplotlib%E5%88%9D%E7%9B%B8%E8%AF%86"
        "/index.html"
    )

    # HTTP headers handed to the loader as its header template.
    # NOTE(review): 'br' is advertised in Accept-Encoding; if the HTTP client
    # in use cannot decode Brotli, the body may come back undecoded — confirm
    # a Brotli decoder is available in the runtime environment.
    request_headers = {
        "User-Agent": "python-requests/2.27.1",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "*/*",
        "Connection": "keep-alive",
    }

    # Build the loader instance for the target page.
    web_loader = WebBaseLoader(web_path=target_url, header_template=request_headers)

    # Ask the loader to fetch the page; the result is sized and indexable.
    documents = web_loader.load()

    # Report on the returned collection before touching its first element.
    print("Type of pages: ", type(documents))
    print("Length of pages: ", len(documents))

    # Inspect the first loaded item: its type, a 500-character preview of the
    # text, and the attached metadata.
    first_document = documents[0]
    print("Type of page: ", type(first_document))
    raw_text = first_document.page_content
    print("Page_content: ", raw_text[:500])
    print("Meta Data: ", first_document.metadata)

    # Drop every line that is empty or whitespace-only, then rejoin the
    # remaining lines with newlines and preview the first 500 characters.
    non_blank_lines = [row for row in raw_text.splitlines() if row.strip()]
    compact_text = "\n".join(non_blank_lines)
    print("Page_content: ", compact_text[:500])